# Provides a session log for the packages used in this .rmd
library(sessioninfo)
session_info(pkgs = "!attached", to_file = "03a_session_log.txt")

1 Introduction

In this script, we go through all the pre-registered proposed analyses. As a reminder, the main research questions where as follows:

  1. To what extent does infants’ preference for IDS as measured in a laboratory setting predict their vocabulary at 18 and 24 months?
  2. Does the relation between IDS preference and vocabulary size change over development?
  3. Are there systematic differences in the strength of this relationship across the language communities in our sample?

Here we present the main “sample theory based” analyses (also known as frequentist), separately on the North American and UK samples in parallel to answer our first two research questions, then together to answer our third research question. In the next section (03b) provide additional Bayesian statistics where a null effect was found, as specified in the pre-registration.

TODO: Delete any libraries that are solely for the Bayes analysis and can be deleted here.

# Library imports, general settings ==============
library(tidyverse)
library(egg)
library(knitr)
library(lme4)
library(lmerTest)
library(simr)
# As in our discussion with Mike, we will use lmerTest for calculating p values
# in the mixed-effects models (noted as a deviation)
library(brms)
library(rstan)
library(future)
plan(multicore, workers = 8) # Swap to multiprocess if running from Rstudio
library(lattice)
library(effects)
library(sjPlot)
library(robustlmm)
library(car)

# Load model comparison functions
source("helper/lrtests.R")

# Deal with package priority issues
select <- dplyr::select

theme_set(theme_bw(base_size = 10))
options("future" = T)
# knitr::opts_chunk$set(cache = TRUE)

print(sessionInfo()) # listing all info about R and packages info
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] furrr_0.3.0          future.apply_1.9.0   bridgesampling_1.1-2
##  [4] car_3.1-0            robustlmm_3.0-1      sjPlot_2.8.10       
##  [7] effects_4.2-1        carData_3.0-5        lattice_0.20-45     
## [10] future_1.27.0        rstan_2.21.5         StanHeaders_2.21.0-7
## [13] brms_2.17.0          Rcpp_1.0.9           simr_1.0.6          
## [16] lmerTest_3.1-3       lme4_1.1-30          Matrix_1.4-1        
## [19] knitr_1.39           egg_0.4.5            gridExtra_2.3       
## [22] forcats_0.5.1        stringr_1.4.0        dplyr_1.0.9         
## [25] purrr_0.3.4          readr_2.1.2          tidyr_1.2.0         
## [28] tibble_3.1.8         ggplot2_3.3.6        tidyverse_1.3.2     
## [31] sessioninfo_1.2.2   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.0         backports_1.4.1      plyr_1.8.7          
##   [4] igraph_1.3.4         splines_4.2.1        RLRsim_3.1-8        
##   [7] listenv_0.8.0        crosstalk_1.2.0      rstantools_2.2.0    
##  [10] inline_0.3.19        digest_0.6.29        htmltools_0.5.3     
##  [13] fansi_1.0.3          magrittr_2.0.3       checkmate_2.1.0     
##  [16] googlesheets4_1.0.0  tzdb_0.3.0           globals_0.15.1      
##  [19] modelr_0.1.8         RcppParallel_5.1.5   matrixStats_0.62.0  
##  [22] xts_0.12.1           prettyunits_1.1.1    colorspace_2.0-3    
##  [25] rvest_1.0.2          fastGHQuad_1.0.1     mitools_2.4         
##  [28] haven_2.5.0          xfun_0.31            callr_3.7.1         
##  [31] crayon_1.5.1         jsonlite_1.8.0       survival_3.3-1      
##  [34] zoo_1.8-10           iterators_1.0.14     glue_1.6.2          
##  [37] gtable_0.3.0         gargle_1.2.0         emmeans_1.7.5       
##  [40] sjstats_0.18.1       sjmisc_2.8.9         distributional_0.3.0
##  [43] pkgbuild_1.3.1       DEoptimR_1.0-11      abind_1.4-5         
##  [46] scales_1.2.0         mvtnorm_1.1-3        DBI_1.1.3           
##  [49] ggeffects_1.1.2      miniUI_0.1.1.1       plotrix_3.8-2       
##  [52] performance_0.9.1    xtable_1.8-4         survey_4.1-1        
##  [55] stats4_4.2.1         DT_0.23              datawizard_0.4.1    
##  [58] htmlwidgets_1.5.4    httr_1.4.3           threejs_0.3.3       
##  [61] posterior_1.2.2      ellipsis_0.3.2       pkgconfig_2.0.3     
##  [64] loo_2.5.1            farver_2.1.1         nnet_7.3-17         
##  [67] sass_0.4.2           dbplyr_2.2.1         binom_1.1-1.1       
##  [70] utf8_1.2.2           effectsize_0.7.0     tidyselect_1.1.2    
##  [73] rlang_1.0.4          reshape2_1.4.4       later_1.3.0         
##  [76] munsell_0.5.0        cellranger_1.1.0     tools_4.2.1         
##  [79] cachem_1.0.6         cli_3.3.0            generics_0.1.3      
##  [82] sjlabelled_1.2.0     broom_1.0.0          ggridges_0.5.3      
##  [85] evaluate_0.15        fastmap_1.1.0        yaml_2.3.5          
##  [88] processx_3.7.0       fs_1.5.2             robustbase_0.95-0   
##  [91] nlme_3.1-158         mime_0.12            xml2_1.3.3          
##  [94] compiler_4.2.1       pbkrtest_0.5.1       bayesplot_1.9.0     
##  [97] shinythemes_1.2.0    rstudioapi_0.13      reprex_2.0.1        
## [100] bslib_0.4.0          stringi_1.7.8        parameters_0.18.1   
## [103] ps_1.7.1             Brobdingnag_1.2-7    nloptr_2.0.3        
## [106] markdown_1.1         shinyjs_2.1.0        tensorA_0.36.2      
## [109] vctrs_0.4.1          pillar_1.8.0         lifecycle_1.0.1     
## [112] jquerylib_0.1.4      estimability_1.4     insight_0.18.0      
## [115] httpuv_1.6.5         R6_2.5.1             promises_1.2.0.1    
## [118] parallelly_1.32.1    codetools_0.2-18     boot_1.3-28         
## [121] colourpicker_1.1.1   MASS_7.3-57          gtools_3.9.3        
## [124] assertthat_0.2.1     withr_2.5.0          shinystan_2.6.0     
## [127] bayestestR_0.12.1    mgcv_1.8-40          parallel_4.2.1      
## [130] hms_1.1.1            grid_4.2.1           coda_0.19-4         
## [133] minqa_1.2.4          rmarkdown_2.14       googledrive_2.0.0   
## [136] numDeriv_2016.8-1.1  shiny_1.7.1          lubridate_1.8.0     
## [139] base64enc_0.1-3      dygraphs_1.1.1.6
# Read data ======================================
col_types <- cols(
  labid = col_factor(),
  subid = col_factor(),
  subid_unique = col_factor(),
  CDI.form = col_factor(),
  CDI.nwords = col_integer(),
  CDI.prop = col_number(),
  CDI.agerange = col_factor(),
  CDI.agedays = col_integer(),
  CDI.agemin = col_integer(),
  CDI.agemax = col_integer(),
  vocab_nwords = col_integer(),
  standardized.score.CDI = col_character(),
  standardized.score.CDI.num = col_number(),
  IDS_pref = col_number(),
  language = col_factor(),
  language_zone = col_factor(),
  CDI.error = col_logical(),
  Notes = col_character(),
  trial_order = col_factor(),
  method = col_factor(),
  age_days = col_integer(),
  age_mo = col_number(),
  age_group = col_factor(),
  nae = col_logical(),
  gender = col_factor(),
  second_session = col_logical()
)
data.total <- read_csv("data/02b_processed.csv", col_types = col_types)

Before moving on with the analysis, we have to ready the data by (a) checking for colinearity between z_age_months and CDI.z_age_months and correcting this if necessary, and (b) setting up the contrasts described in our data analysis.

1.1 Colinearity check

First, we run a Kappa test on the possibility of colinearity between z_age_months and CDI.z_age_months.

# Run kappa test
k.age_months <- model.matrix(~ z_age_months + CDI.z_age_months, data = data.total) %>%
  kappa(exact = T)

With a value of 3.1953332, we do not have a colinearity issue and can proceed with the data analysis as planned (The criteria of indicating colinearity is that kappa > 10).

1.2 Contrast Setups

We need gender as an effect-coded factor, and method as a deviation-coded factor. This is achieved in R by using the contr.sum() function with the number of levels for each factor. Notably, when subsetting the UK sample, only two levels of method out of the three in total were left.

# Set contrasts on the total dataset =============
contrasts(data.total$gender) <- contr.sum(2)
contrasts(data.total$method) <- contr.sum(3)

# Create sub-datasets, with contrasts ============
## NAE
data.nae <- data.total %>%
  subset(language_zone == "NAE") %>%
  droplevels()
contrasts(data.nae$gender) <- contr.sum(2)
contrasts(data.nae$method) <- contr.sum(3)

## UK (combined-age and separate 18/24 months)

data.uk <- data.total %>%
  subset(language_zone == "British") %>%
  droplevels()
contrasts(data.uk$gender) <- contr.sum(2)
contrasts(data.uk$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels


data.uk.18 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange ==
    "18") %>%
  droplevels()
contrasts(data.uk.18$gender) <- contr.sum(2)
contrasts(data.uk.18$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels

data.uk.24 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange ==
    "24") %>%
  droplevels()
contrasts(data.uk.24$gender) <- contr.sum(2)
contrasts(data.uk.24$method) <- contr.sum(2) # note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels



## Other
data.other <- data.total %>%
  subset(language_zone == "Other") %>%
  droplevels()
contrasts(data.other$gender) <- contr.sum(2)
contrasts(data.other$method) <- contr.sum(3)

2 Descriptive Statistics

We first assess the amount of data we have overall per condition and their shape overall.

data.total %>%
  group_by(language_zone, CDI.agerange, method, gender) %>%
  summarise(N = n(), age = mean(CDI.agedays), sd = sd(CDI.agedays)) %>%
  kable()
## `summarise()` has grouped output by 'language_zone', 'CDI.agerange', 'method'.
## You can override using the `.groups` argument.
language_zone CDI.agerange method gender N age sd
British 18 singlescreen F 24 551.2083 10.741950
British 18 singlescreen M 24 550.5000 13.497182
British 18 eyetracking F 9 548.5556 9.139353
British 18 eyetracking M 11 547.8182 10.147100
British 24 singlescreen F 18 741.6111 13.128948
British 24 singlescreen M 15 745.0667 15.549307
British 24 eyetracking F 13 731.5385 12.162890
British 24 eyetracking M 5 737.8000 8.228001
Other 18 singlescreen F 11 541.5455 7.160498
Other 18 singlescreen M 13 544.3077 6.663140
Other 18 eyetracking F 26 556.0769 14.133430
Other 18 eyetracking M 30 560.8333 16.128970
Other 18 hpp F 38 549.0526 10.812774
Other 18 hpp M 38 555.1579 13.664961
Other 24 singlescreen F 10 731.3000 14.802777
Other 24 singlescreen M 12 721.1667 13.888081
Other 24 eyetracking F 28 730.1786 9.996494
Other 24 eyetracking M 25 730.1600 7.711896
Other 24 hpp F 45 730.7333 17.357733
Other 24 hpp M 35 730.5143 15.849237
NAE 18 singlescreen F 19 554.9474 20.780530
NAE 18 singlescreen M 14 581.2143 24.925052
NAE 18 eyetracking F 1 549.0000 NA
NAE 18 hpp F 56 560.9821 21.584920
NAE 18 hpp M 57 559.6140 21.163272
NAE 24 singlescreen F 23 737.1739 26.604258
NAE 24 singlescreen M 20 739.6000 21.352678
NAE 24 eyetracking F 2 756.5000 34.648232
NAE 24 eyetracking M 1 745.0000 NA
NAE 24 hpp F 54 733.9630 27.476895
NAE 24 hpp M 60 727.9500 27.409899

More detailed information about Descriptive Statistics

# number of lab
data.total %>%
  select(labid, language_zone) %>%
  unique() %>%
  group_by(language_zone) %>%
  count()
## # A tibble: 3 × 2
## # Groups:   language_zone [3]
##   language_zone     n
##   <fct>         <int>
## 1 British           4
## 2 Other             8
## 3 NAE               8
data.total %>%
  group_by(language_zone, CDI.agerange) %>%
  summarize(N = n())
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups:   language_zone [3]
##   language_zone CDI.agerange     N
##   <fct>         <fct>        <int>
## 1 British       18              68
## 2 British       24              51
## 3 Other         18             156
## 4 Other         24             155
## 5 NAE           18             147
## 6 NAE           24             160
# age range in each age group and language zone
data.total %>%
  select(subid, language_zone, CDI.agedays, CDI.agerange) %>%
  unique() %>%
  group_by(language_zone, CDI.agerange) %>%
  summarize(
    age_min = (min(CDI.agedays) / 365.25 * 12),
    age_max = (max(CDI.agedays) / 365.25 * 12)
  )
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups:   language_zone [3]
##   language_zone CDI.agerange age_min age_max
##   <fct>         <fct>          <dbl>   <dbl>
## 1 British       18              17.4    19.2
## 2 British       24              23.4    25.1
## 3 Other         18              17.4    19.5
## 4 Other         24              22.5    25.2
## 5 NAE           18              16.6    20.0
## 6 NAE           24              22.2    25.9

We then assess the data per lab in terms of sample size and CDI score (vocabulary size, for consistency between language zones).

by_lab <- data.total %>%
  group_by(labid, language_zone, CDI.agerange) %>%
  mutate(tested = n_distinct(subid_unique)) %>%
  select(labid, language_zone, CDI.agerange, tested, vocab_nwords) %>%
  nest(scores = vocab_nwords) %>%
  mutate(
    model = map(scores, ~ lm(vocab_nwords ~ 1, data = .x)),
    ci = map(model, confint)
  ) %>%
  transmute(
    tested = tested,
    mean = map_dbl(model, ~ coefficients(.x)[[1]]),
    ci_lower = map_dbl(ci, 1),
    ci_upper = map_dbl(ci, 2)
  ) %>%
  arrange(language_zone) %>%
  rownames_to_column() %>%
  ungroup()

2.1 Visualization by Lab

# created a new column with the labs IDs as character so it can be sorted in alphabetical order
by_lab$labid_car <- as.character(by_lab$labid)

# relabel the CDI age factor column
levels(by_lab$CDI.agerange) <- c("18 months", "24 months")

# Making sure the factor columns have levels in the order I would like to graph them
by_lab$language_zone <- factor(by_lab$language_zone, levels = c("British", "NAE", "Other"))

# sorted the idcolum in the way I would it to show in the ggplot
labid_ord <- by_lab %>%
  dplyr::arrange(language_zone, desc(labid_car)) %>%
  ungroup() %>%
  filter(CDI.agerange == "18 months") %>%
  select(labid)

# Making sure the factor columns have levels in the order I would like to graph them
by_lab$labid <- factor(by_lab$labid, levels = labid_ord$labid)

## graph by language zone and lab Id in asc order
by_lab %>%
  ggplot(
    .,
    aes(
      x = labid,
      y = mean, colour = language_zone,
      ymin = ci_lower, ymax = ci_upper
    )
  ) +
  geom_linerange() +
  geom_point(aes(size = tested)) +
  coord_flip(ylim = c(0, 500)) +
  xlab("Lab") +
  ylab("Vocabulary size") +
  scale_colour_brewer(
    palette = "Dark2", name = "Language zone",
    breaks = c("British", "NAE", "Other")
  ) +
  scale_size_continuous(name = "N", breaks = function(x) c(min(x), mean(x), max(x))) +
  facet_wrap(vars(CDI.agerange)) +
  theme(
    legend.position = "bottom",
    strip.background = element_blank(),
    strip.placement = "outside",
    text = element_text(size = 22)
  )

ggsave("plots/vocab-size_by-lab.png", width = 12, height = 8)

3 Sample Theory Based Statistics

3.1 Simple Correlation

First, we want to assess quickly if there is a direct correlation between IDS preference and CDI score, computing a Pearson#’s product-moment correlation. We use standardized CDI scores for the North American sample, and raw scores for the British sample. Since CDI grows with age, we run the British sample separately for 18 and 24 months.

# Statistics =====================================
## North American Sample
test.pearson.nae <- cor.test(data.nae$IDS_pref,
  data.nae$z_standardized_CDI,
  alternative = "two.sided", method = "pearson"
)

test.pearson.nae
## 
##  Pearson's product-moment correlation
## 
## data:  data.nae$IDS_pref and data.nae$z_standardized_CDI
## t = -0.91847, df = 305, p-value = 0.3591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.16349833  0.05977293
## sample estimates:
##         cor 
## -0.05251901
## UK Sample
test.pearson.uk.18 <- cor.test(data.uk.18$IDS_pref,
  data.uk.18$z_vocab_nwords,
  alternative = "two.sided", method = "pearson"
)

test.pearson.uk.18
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.18$IDS_pref and data.uk.18$z_vocab_nwords
## t = 0.70855, df = 66, p-value = 0.4811
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1547438  0.3187097
## sample estimates:
##        cor 
## 0.08688698
test.pearson.uk.24 <- cor.test(data.uk.24$IDS_pref,
  data.uk.24$z_vocab_nwords,
  alternative = "two.sided", method = "pearson"
)

test.pearson.uk.24
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.24$IDS_pref and data.uk.24$z_vocab_nwords
## t = 0.36606, df = 49, p-value = 0.7159
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2266228  0.3231553
## sample estimates:
##        cor 
## 0.05222234

Plots for correlation

## North American Sample
### Get correlation value for annotation
cor_text <- "paste(italic(R)^2, \" =\")"
cor_value <- round(test.pearson.nae$estimate, 3)

### Build plot
plot.pearson.nae <- data.nae %>%
  ggplot(aes(
    x = IDS_pref,
    y = standardized.score.CDI.num
  )) +
  xlab("IDS preference") +
  ylab("Standardized CDI score") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text",
    x = -.9, y = 51, parse = T, size = 4,
    label = paste(cor_text, cor_value, sep = "~")
  )

## UK Sample
cor_value <- round(test.pearson.uk.18$estimate, 3)
plot.pearson.uk.18 <- data.uk.18 %>%
  ggplot(aes(
    x = IDS_pref,
    y = vocab_nwords
  )) +
  xlab("IDS preference") +
  ylab("Vocabulary size (in words)") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text",
    x = .8, y = 150, parse = T, size = 4,
    label = paste(cor_text, cor_value, sep = "~")
  )

cor_value <- round(test.pearson.uk.24$estimate, 3)
plot.pearson.uk.24 <- data.uk.24 %>%
  ggplot(aes(
    x = IDS_pref,
    y = vocab_nwords
  )) +
  xlab("IDS preference") +
  ylab("Vocabulary size (in words)") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text",
    x = .8, y = 150, parse = T, size = 4,
    label = paste(cor_text, cor_value, sep = "~")
  )

# Global plot
plot.pearson <- ggarrange(plot.pearson.nae, plot.pearson.uk.18, plot.pearson.uk.24, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

plot.pearson

# TODO: These plots need to be cleaned up!

ggsave("plots/corr_plot.pdf", plot.pearson,
  units = "mm", width = 180, height = 100, dpi = 1000
)

We see no obvious direct link between IDS prefernce and CDI score here. However, an effect might appear once we take into account various factors that might interact with IDS preference and/or CDI score. We can also first enhance these plots with information about the age group at which infants were tested (18- or 24-month-old) for the NAE sample, using vocabulary size to better compare the NAE and UK samples.

plot.age_group <- data.total %>%
  subset(language_zone != "Other") %>%
  droplevels() %>%
  ggplot(aes(
    x = IDS_pref,
    y = vocab_nwords,
    colour = CDI.agerange
  )) +
  facet_wrap(vars(language_zone),
    labeller = as_labeller(c(
      "British" = "UK samples",
      "NAE" = "North Amercian samples"
    ))
  ) +
  xlab("Standardized IDS prefence score") +
  ylab("Vocabulary size (in words)") +
  theme(legend.position = "top") +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_brewer(
    palette = "Dark2", name = "Age group",
    breaks = c("18", "24"),
    labels = c("18mo", "24m")
  )
ggsave("plots/scatter_age.pdf", plot.age_group,
  units = "mm", width = 180, height = 100, dpi = 1000
)
## `geom_smooth()` using formula 'y ~ x'
(plot.age_group)
## `geom_smooth()` using formula 'y ~ x'

3.2 Mixed-Effects Model by Language Zone

Here, we run a mixed-effects model including only theoretically motivated effects, as described in the pre-registration. We start with the full model bellow, simplifying the random effects structure until it converges.

3.2.1 NAE full model

# Run models =====================================
## NAE

data.nae$centered_IDS_pref <- scale(data.nae$IDS_pref, center = T, scale = F)

lmer.full.nae <- lmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
  z_age_months + method + centered_IDS_pref +
  centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
  (1 | labid) + (1 | subid_unique),
data = data.nae
)

summary(lmer.full.nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +  
##     method + centered_IDS_pref + centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months +  
##     centered_IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
##    Data: data.nae
## 
## REML criterion at convergence: 2806.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2240 -0.4550 -0.0619  0.4779  2.5238 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 531.62   23.06   
##  labid        (Intercept)  37.34    6.11   
##  Residual                 246.81   15.71   
## Number of obs: 307, groups:  subid_unique, 211; labid, 8
## 
## Fixed effects:
##                                     Estimate Std. Error        df t value
## (Intercept)                         32.68863    6.42717  48.76105   5.086
## CDI.z_age_months                     0.04467    0.35954 121.35781   0.124
## gender1                             -1.51119    1.88476 193.54899  -0.802
## z_age_months                        -0.02603    0.62952  69.26913  -0.041
## method1                              6.45062    6.51547 121.14260   0.990
## method2                            -13.61370   11.77122 176.84942  -1.157
## centered_IDS_pref                  -33.98215   24.82232 210.07506  -1.369
## method1:centered_IDS_pref           28.35879   25.46036 210.29032   1.114
## method2:centered_IDS_pref          -59.58231   49.55101 209.34368  -1.202
## CDI.z_age_months:centered_IDS_pref   0.41746    1.10252 123.90420   0.379
## z_age_months:centered_IDS_pref       2.01470    2.07816 189.23347   0.969
##                                    Pr(>|t|)    
## (Intercept)                        5.82e-06 ***
## CDI.z_age_months                      0.901    
## gender1                               0.424    
## z_age_months                          0.967    
## method1                               0.324    
## method2                               0.249    
## centered_IDS_pref                     0.172    
## method1:centered_IDS_pref             0.267    
## method2:centered_IDS_pref             0.231    
## CDI.z_age_months:centered_IDS_pref    0.706    
## z_age_months:centered_IDS_pref        0.334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 methd2 c_IDS_ m1:_ID m2:_ID
## CDI.z_g_mnt -0.056                                                         
## gender1     -0.033  0.022                                                  
## z_age_mnths  0.051 -0.037  -0.023                                          
## method1     -0.711  0.001   0.022  0.106                                   
## method2      0.878 -0.031  -0.037  0.017 -0.875                            
## cntrd_IDS_p  0.330  0.007   0.039 -0.040 -0.323  0.359                     
## mthd1:_IDS_ -0.310 -0.029  -0.027  0.019  0.335 -0.357 -0.927              
## mthd2:_IDS_  0.324  0.022   0.060 -0.010 -0.336  0.363  0.966 -0.971       
## CDI.__:_IDS -0.010 -0.006  -0.019 -0.018 -0.030  0.010 -0.047  0.027 -0.047
## z_g_m:_IDS_ -0.041  0.028   0.103  0.027 -0.035  0.019  0.035 -0.086  0.132
##             CDI.__:
## CDI.z_g_mnt        
## gender1            
## z_age_mnths        
## method1            
## method2            
## cntrd_IDS_p        
## mthd1:_IDS_        
## mthd2:_IDS_        
## CDI.__:_IDS        
## z_g_m:_IDS_ -0.071
# robust_lmer.full.nae <- robustlmm::rlmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
#                         z_age_months + method + centered_IDS_pref +
#                         centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
#                         (1 | labid),
#                       data = data.nae)
#
#
# summary(robust_lmer.full.nae) #this model is used to see if we can meet some statistical assumption better but we decided to use the original model as the inferential statistical results are consistent

full.nae_pvalue <- anova(lmer.full.nae) %>%
  as_tibble(rownames = "Parameter") # this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# ==========

3.2.1.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity
# data.nae$resid <- residuals(lmer.full.nae)
#
# plot(data.nae$resid, data.nae$standardized.score.CDI)

# Second, check normality
plot_model(lmer.full.nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

# Third, check autocorrelation
re_run_lme.full.nae <- nlme::lme(standardized.score.CDI.num ~ CDI.z_age_months + gender +
  z_age_months + method + centered_IDS_pref +
  centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)

plot(nlme::ACF(re_run_lme.full.nae, resType = "normalized")) # there is no sign for autocorrelation

# Lastly, check multi-collinearity
car::vif(lmer.full.nae) # we do see a multicollineartiy for the IDS preference variable, even though we have centered the IDS preference score. It is probably related to the number the participating labs (as this is the group level that we are controlling) and how we entered interaction between IDS preference and other variables (that lack variability in the current sample). We need to keep IDS preference in the model as exploring the relationship between IDS preference and CDI score is the key research question in the paper.
##                                         GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months                    1.009197  1        1.004588
## gender                              1.039338  1        1.019479
## z_age_months                        1.095177  1        1.046507
## method                              1.258318  2        1.059126
## centered_IDS_pref                  19.302592  1        4.393472
## method:centered_IDS_pref           20.839041  2        2.136581
## CDI.z_age_months:centered_IDS_pref  1.014438  1        1.007193
## z_age_months:centered_IDS_pref      1.264833  1        1.124648

3.2.2 UK full model

lmer.full.uk <- lmer(vocab_nwords ~ CDI.z_age_months + gender +
  z_age_months + method + IDS_pref +
  IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
  (1 | labid) + (1 | subid_unique),
data = data.uk
)

summary(lmer.full.uk)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: vocab_nwords ~ CDI.z_age_months + gender + z_age_months + method +  
##     IDS_pref + IDS_pref:method + IDS_pref:CDI.z_age_months +  
##     IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
##    Data: data.uk
## 
## REML criterion at convergence: 1326
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.78175 -0.38449  0.01193  0.37856  1.94702 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 5412.10  73.567  
##  labid        (Intercept)   33.24   5.765  
##  Residual                 2637.21  51.354  
## Number of obs: 119, groups:  subid_unique, 88; labid, 4
## 
## Fixed effects:
##                           Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)                175.945     11.558   3.774  15.223 0.000158 ***
## CDI.z_age_months            28.905      1.964  44.974  14.714  < 2e-16 ***
## gender1                     19.043      9.820  78.517   1.939 0.056074 .  
## z_age_months                -5.065      3.455  70.444  -1.466 0.147087    
## method1                    -11.249     13.255   5.502  -0.849 0.431430    
## IDS_pref                    10.003     26.512  83.053   0.377 0.706906    
## method1:IDS_pref            -7.789     29.223  87.125  -0.267 0.790455    
## CDI.z_age_months:IDS_pref    1.573      5.726  50.812   0.275 0.784676    
## z_age_months:IDS_pref        1.154      8.805  89.735   0.131 0.896025    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 IDS_pr m1:IDS CDI.__:
## CDI.z_g_mnt  0.127                                                   
## gender1      0.043 -0.069                                            
## z_age_mnths  0.013 -0.099  -0.019                                    
## method1     -0.383 -0.074  -0.090  0.490                             
## IDS_pref    -0.362 -0.076  -0.118  0.023  0.197                      
## mthd1:IDS_p  0.186  0.094   0.007 -0.117 -0.303 -0.193               
## CDI.__:IDS_ -0.100 -0.328   0.056  0.064  0.094  0.096 -0.136        
## z_g_mn:IDS_ -0.033  0.079  -0.244 -0.374 -0.099 -0.087  0.431 -0.228
full.uk_pvalue <- anova(lmer.full.uk) %>%
  as_tibble(rownames = "Parameter") # this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months

3.2.2.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity. The plot looks linear
data.uk$resid <- residuals(lmer.full.uk)

plot(data.uk$resid, data.uk$vocab_nwords)

# Second, check normality
plot_model(lmer.full.uk, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

# Third, check autocorrelation
re_run_lme.full.uk <- nlme::lme(vocab_nwords ~ CDI.z_age_months + gender +
  z_age_months + method + IDS_pref +
  IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)

plot(nlme::ACF(re_run_lme.full.uk, resType = "normalized")) # there is no sign for autocorrelation

# Lastly, check multi-collinearity
car::vif(lmer.full.uk) # no problem for multicollinearlity
##          CDI.z_age_months                    gender              z_age_months 
##                  1.142786                  1.125370                  1.678295 
##                    method                  IDS_pref           method:IDS_pref 
##                  1.592431                  1.095868                  1.457528 
## CDI.z_age_months:IDS_pref     z_age_months:IDS_pref 
##                  1.189684                  1.730676

We now want to check the statistical power of significant effects, and discard any models with significant effects that do not reach 80% power. This however leads to too many warnings of singularity issues on the model updates inherent to the simr power simulations, hence we cannot obtain satisfactory power estimates as pre-registered.

AST: Note that we don’t have any IV(s) that turned out to be significant in the Full NAE model. So we won’t run the power analysis check. For the UK full model, there are two statistically significant IV: CDI_age and gender. The post hoc power check suggested that we have high power in detecting the effect of CDI_age but not gender. Note that gender has a smaller effect size to begin with, so this may partially explain why we have less power in detecting it in the model. As there can be a number of different factors that determines the posthoc power, we decided not to remove gender in the model based on posthoc power analysis check.

check_pwr_uk_cdi_age <- simr::powerSim(lmer.full.uk, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into

check_pwr_uk_cdi_age

check_pwr_uk_gender <- simr::powerSim(lmer.full.uk, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into

check_pwr_uk_gender

3.2.3 Combined Sample

For this combined analysis, we first need to restrain the age range for the NAE sample (previously ±2 months, now ±1.5 months). [Note: This should be +/- 0.5 months…]

# Create dataset with British and NAE only
data.uk_nae <- data.total %>%
  subset(language_zone %in% c("British", "NAE")) %>%
  mutate(
    CDI.agemin = ifelse(language_zone == "NAE",
      CDI.agemin + round(.5 * 365.25 / 12),
      CDI.agemin
    ),
    CDI.agemax = ifelse(language_zone == "NAE",
      CDI.agemax - round(.5 * 365.25 / 12),
      CDI.agemax
    )
  ) %>%
  subset(!(CDI.agedays < CDI.agemin | CDI.agedays > CDI.agemax)) %>%
  droplevels()
# Create contrasts for analysis
contrasts(data.uk_nae$gender) <- contr.sum(2)
contrasts(data.uk_nae$method) <- contr.sum(3)
contrasts(data.uk_nae$language_zone) <- contr.sum(2)

We can then run the planned combined analysis adding the main effect and interactions of language_zone.

lmer.full.uk_nae <- lmer(CDI.prop ~ CDI.z_age_months + language_zone + gender +
  z_age_months + method + IDS_pref + IDS_pref:language_zone +
  IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
  (1 + CDI.z_age_months | labid) + (1 | subid_unique),
data = data.uk_nae
)

summary(lmer.full.uk_nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDI.prop ~ CDI.z_age_months + language_zone + gender + z_age_months +  
##     method + IDS_pref + IDS_pref:language_zone + IDS_pref:method +  
##     IDS_pref:CDI.z_age_months + IDS_pref:z_age_months + (1 +  
##     CDI.z_age_months | labid) + (1 | subid_unique)
##    Data: data.uk_nae
## 
## REML criterion at convergence: -56
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.86767 -0.46070 -0.05384  0.41409  2.63524 
## 
## Random effects:
##  Groups       Name             Variance  Std.Dev. Corr
##  subid_unique (Intercept)      0.0283551 0.16839      
##  labid        (Intercept)      0.0005729 0.02394      
##               CDI.z_age_months 0.0010798 0.03286  0.45
##  Residual                      0.0173293 0.13164      
## Number of obs: 401, groups:  subid_unique, 286; labid, 12
## 
## Fixed effects:
##                             Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)                 0.322978   0.018753   8.939085  17.223 3.66e-08 ***
## CDI.z_age_months            0.043288   0.010038  11.995084   4.312  0.00101 ** 
## language_zone1              0.088047   0.022622   8.452255   3.892  0.00413 ** 
## gender1                     0.035741   0.012381 252.339085   2.887  0.00423 ** 
## z_age_months               -0.002581   0.004453 168.109225  -0.580  0.56289    
## method1                    -0.007231   0.025155  16.577207  -0.287  0.77732    
## method2                     0.008879   0.036872  12.524954   0.241  0.81360    
## IDS_pref                    0.031786   0.042931 276.643855   0.740  0.45970    
## language_zone1:IDS_pref     0.033524   0.057322 295.425539   0.585  0.55910    
## method1:IDS_pref           -0.038658   0.055555 298.380191  -0.696  0.48706    
## method2:IDS_pref           -0.016639   0.088523 285.347212  -0.188  0.85104    
## CDI.z_age_months:IDS_pref   0.006813   0.008220 158.076039   0.829  0.40841    
## z_age_months:IDS_pref      -0.001588   0.012432 265.483177  -0.128  0.89847    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
combined.full.uk_nae_pvalue <- anova(lmer.full.uk_nae) %>%
  as_tibble(rownames = "Parameter") # this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref:language_zone
# IDS_pref
# method
# z_age_months
# gender
# language_zone
# ==========

3.2.3.1 (Optional) Checking mixed-model assumptions

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity. The plot looks linear
data.uk_nae$resid <- residuals(lmer.full.uk_nae)

plot(data.uk_nae$resid, data.uk_nae$CDI.prop)

# Second, check normality
plot_model(lmer.full.uk_nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

# Third, check autocorrelation
re_run_lme.full.uk_nae <- nlme::lme(CDI.prop ~ CDI.z_age_months + language_zone + gender +
  z_age_months + method + IDS_pref + IDS_pref:language_zone +
  IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
random = ~ CDI.z_age_months | labid,
method = "REML",
data = data.uk_nae, na.action = na.exclude
)

plot(nlme::ACF(re_run_lme.full.uk_nae, resType = "normalized")) # there is no sign for autocorrelation

# Lastly, check multi-collinearity
car::vif(lmer.full.uk_nae) # no problem for multicollinearlity
##                               GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months          1.015449  1        1.007695
## language_zone             2.209327  1        1.486380
## gender                    1.035132  1        1.017414
## z_age_months              1.488905  1        1.220207
## method                    3.162695  2        1.333565
## IDS_pref                  1.419845  1        1.191572
## language_zone:IDS_pref    2.765706  1        1.663041
## method:IDS_pref           3.755363  2        1.392076
## CDI.z_age_months:IDS_pref 1.048852  1        1.024135
## z_age_months:IDS_pref     1.496611  1        1.223361

We then compute \(p\)-values, but leave out power estimates for those \(p\)-values as above. Again, we have a lot of singular fit issues for the power checks and decided not to remove parameters based on posthoc power analysis.

check_pwr_combined_cdi_age <- simr::powerSim(lmer.full.uk_nae, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into

check_pwr_combined_cdi_age

check_pwr_combined_lang_zone <- simr::powerSim(lmer.full.uk_nae, test = fixed("language_zone", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into

check_pwr_combined_lang_zone

check_pwr_combined_gender <- simr::powerSim(lmer.full.uk_nae, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that Gender is the ixed effect that we are looking into

check_pwr_combined_gender